Part i: Exploration

1. Load package and data

1.1 Load package

library(tidyverse)
## ── Attaching packages ─────────────────────────────────────── tidyverse 1.3.2 ──
## ✔ ggplot2 3.4.0     ✔ purrr   1.0.1
## ✔ tibble  3.1.8     ✔ dplyr   1.1.0
## ✔ tidyr   1.3.0     ✔ stringr 1.5.0
## ✔ readr   2.1.2     ✔ forcats 0.5.1
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()

1.2 Load data

# Read the training data; the first row of the CSV supplies the column names.
df <- readr::read_csv(
  file = "paint_project_train_data.csv",
  col_names = TRUE
)
## Rows: 835 Columns: 8
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (2): Lightness, Saturation
## dbl (6): R, G, B, Hue, response, outcome
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# Preview the column types and the first few values of every column.
glimpse(df)
## Rows: 835
## Columns: 8
## $ R          <dbl> 172, 26, 172, 28, 170, 175, 90, 194, 171, 122, 0, 88, 144, …
## $ G          <dbl> 58, 88, 94, 87, 66, 89, 78, 106, 68, 151, 121, 140, 82, 163…
## $ B          <dbl> 62, 151, 58, 152, 58, 65, 136, 53, 107, 59, 88, 58, 132, 50…
## $ Lightness  <chr> "dark", "dark", "dark", "dark", "dark", "dark", "dark", "da…
## $ Saturation <chr> "bright", "bright", "bright", "bright", "bright", "bright",…
## $ Hue        <dbl> 4, 31, 8, 32, 5, 6, 34, 10, 1, 21, 24, 22, 36, 16, 26, 12, …
## $ response   <dbl> 12, 10, 16, 10, 11, 16, 10, 19, 14, 25, 14, 19, 14, 38, 15,…
## $ outcome    <dbl> 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1,…

2. Visualize the distributions of variables in the data set.

2.1 Counts for categorical variables.

# Label every row by its binary outcome so the class counts are readable.
# `outcome` is referenced as a bare column inside mutate() rather than as
# `df$outcome`, so the expression is evaluated against the piped data and not
# against whatever the global `df` happens to be.
df <- df %>%
  mutate(outcome_event = if_else(outcome == 1, "event", "non_event"))

# Bar chart of the number of rows in each outcome class. geom_col() draws
# bar heights taken directly from the pre-computed counts in `n`.
df %>%
  count(outcome_event) %>%
  ggplot(mapping = aes(x = outcome_event, y = n)) +
  geom_col(fill = "steelblue", alpha = 0.8)

For the categorical outcome variable, the classes are not balanced: non_events are significantly more numerous than events.

2.2 Outcome interaction with Saturation

Histograms or Density plots for continuous variables. Are the distributions Gaussian like?

# Kernel density estimate of R.
ggplot(data = data.frame(x = df$R), mapping = aes(x = x)) +
  geom_density(fill = "steelblue", alpha = 0.5) +
  labs(title = "Density Plot of Variable R", x = "Value of R", y = "Density")

# Bin R into intervals of width 20 between 0 and 300. include.lowest = TRUE
# makes the first interval closed on the left, so a value of 0 is binned too.
df_bin_R <- mutate(
  df,
  R_bin = cut(R, breaks = seq(from = 0, to = 300, by = 20), include.lowest = TRUE)
)

# Number of observations falling in each R bin.
ggplot(data = df_bin_R, mapping = aes(x = R_bin)) +
  geom_bar(fill = "steelblue")

# Kernel density estimate of G.
ggplot(data = data.frame(x = df$G), mapping = aes(x = x)) +
  geom_density(fill = "steelblue", alpha = 0.5) +
  labs(title = "Density Plot of Variable G", x = "Value of G", y = "Density")

# Bin G into intervals of width 20 between 0 and 300. include.lowest = TRUE
# makes the first interval closed on the left, so a value of 0 is binned too.
df_bin_G <- mutate(
  df,
  G_bin = cut(G, breaks = seq(from = 0, to = 300, by = 20), include.lowest = TRUE)
)

# Number of observations falling in each G bin.
ggplot(data = df_bin_G, mapping = aes(x = G_bin)) +
  geom_bar(fill = "steelblue")

# Kernel density estimate of B.
ggplot(data = data.frame(x = df$B), mapping = aes(x = x)) +
  geom_density(fill = "steelblue", alpha = 0.5) +
  labs(title = "Density Plot of Variable B", x = "Value of B", y = "Density")

# Bin B into intervals of width 20 between 0 and 300. include.lowest = TRUE
# makes the first interval closed on the left, so a value of 0 is binned too.
df_bin_B <- mutate(
  df,
  B_bin = cut(B, breaks = seq(from = 0, to = 300, by = 20), include.lowest = TRUE)
)

# Number of observations falling in each B bin.
ggplot(data = df_bin_B, mapping = aes(x = B_bin)) +
  geom_bar(fill = "steelblue")

# Kernel density estimate of Hue.
ggplot(data = data.frame(x = df$Hue), mapping = aes(x = x)) +
  geom_density(fill = "steelblue", alpha = 0.5) +
  labs(title = "Density Plot of Variable Hue", x = "Value of Hue", y = "Density")

# Bin Hue into intervals of width 5 between 0 and 40. include.lowest = TRUE
# makes the first interval closed on the left, so a value of 0 is binned too.
df_bin_Hue <- mutate(
  df,
  Hue_bin = cut(Hue, breaks = seq(from = 0, to = 40, by = 5), include.lowest = TRUE)
)

# Number of observations falling in each Hue bin.
ggplot(data = df_bin_Hue, mapping = aes(x = Hue_bin)) +
  geom_bar(fill = "steelblue")

The distributions of the continuous variables R, G, and B look roughly Gaussian, but the distribution of Hue does not.

Condition (group) the continuous variables based on the categorical variables.

Are there differences in continuous variable distributions and continuous variable summary statistics based on categorical variable values?

# Response against each continuous input, coloured by Lightness and faceted
# by Saturation. Line thickness is set with `linewidth`: using `size` for
# lines is deprecated as of ggplot2 3.4.0 and triggers a deprecation warning.
df %>%
  ggplot(aes(x = R, y = response, color = Lightness)) +
  geom_line(linewidth = 1.2) +
  facet_wrap(~Saturation)

df %>%
  ggplot(aes(x = G, y = response, color = Lightness)) +
  geom_line(linewidth = 1.2) +
  facet_wrap(~Saturation)

df %>%
  ggplot(aes(x = B, y = response, color = Lightness)) +
  geom_line(linewidth = 1.2) +
  facet_wrap(~Saturation)

df %>%
  ggplot(aes(x = Hue, y = response, color = Lightness)) +
  geom_line(linewidth = 1.2) +
  facet_wrap(~Saturation)

# Same view with the sum of all four continuous inputs on the x axis.
df %>%
  ggplot(aes(x = R + G + B + Hue, y = response, color = Lightness)) +
  geom_line(linewidth = 1.2) +
  facet_wrap(~Saturation)

Overall, the trend is the same across groups. However, variable B shows a slightly different trend for the BRIGHT and PURE classifications.

Visualize the relationships between the continuous inputs, are they correlated?

# Drop the output columns and the categorical columns so that only the
# remaining numeric columns enter the correlation matrix.
df_continuous <- df %>%
  dplyr::select(-c(outcome, Saturation, Lightness, outcome_event, response))

# Upper-triangle plot of the pairwise correlation matrix.
corrplot::corrplot(cor(df_continuous), type = "upper")

Most of the variables are correlated with one another, but the correlations of Hue with response and with G are not significant.

Visualize the relationships between the continuous outputs (response and the LOGIT-transformed response, y) with respect to the continuous INPUTS.

# Rescale response to a proportion, using 0 and 100 as the lower and upper
# bounds, logit-transform it into `y`, and keep the inputs, response, and y.
df_logit <- df %>%
  mutate(y = boot::logit((response - 0) / (100 - 0))) %>%
  dplyr::select(R, G, B, Lightness, Saturation, Hue, response, y)

glimpse(df_logit)
## Rows: 835
## Columns: 8
## $ R          <dbl> 172, 26, 172, 28, 170, 175, 90, 194, 171, 122, 0, 88, 144, …
## $ G          <dbl> 58, 88, 94, 87, 66, 89, 78, 106, 68, 151, 121, 140, 82, 163…
## $ B          <dbl> 62, 151, 58, 152, 58, 65, 136, 53, 107, 59, 88, 58, 132, 50…
## $ Lightness  <chr> "dark", "dark", "dark", "dark", "dark", "dark", "dark", "da…
## $ Saturation <chr> "bright", "bright", "bright", "bright", "bright", "bright",…
## $ Hue        <dbl> 4, 31, 8, 32, 5, 6, 34, 10, 1, 21, 24, 22, 36, 16, 26, 12, …
## $ response   <dbl> 12, 10, 16, 10, 11, 16, 10, 19, 14, 25, 14, 19, 14, 38, 15,…
## $ y          <dbl> -1.9924302, -2.1972246, -1.6582281, -2.1972246, -2.0907411,…

Can you identify any clear trends? Do the trends depend on the categorical INPUTS?

# Loess-smoothed trend of the logit-transformed response `y` against each
# continuous input, coloured by Lightness with one free-scaled panel per
# Saturation level. The smooth line's thickness is set with `linewidth`:
# using `size` for lines is deprecated as of ggplot2 3.4.0.
df_logit %>%
  ggplot(mapping = aes(x = R, y = y, color = Lightness)) +
  geom_smooth(method = "loess", formula = y ~ x, linewidth = 0.4) +
  facet_wrap(~Saturation, scales = "free") +
  theme_bw()

df_logit %>%
  ggplot(mapping = aes(x = G, y = y, color = Lightness)) +
  geom_smooth(method = "loess", formula = y ~ x, linewidth = 0.4) +
  facet_wrap(~Saturation, scales = "free") +
  theme_bw()

df_logit %>%
  ggplot(mapping = aes(x = B, y = y, color = Lightness)) +
  geom_smooth(method = "loess", formula = y ~ x, linewidth = 0.4) +
  facet_wrap(~Saturation, scales = "free") +
  theme_bw()

df_logit %>%
  ggplot(mapping = aes(x = Hue, y = y, color = Lightness)) +
  geom_smooth(method = "loess", formula = y ~ x, linewidth = 0.4) +
  facet_wrap(~Saturation, scales = "free") +
  theme_bw()

# Response against each continuous input in its own free-scaled panel, with
# the raw points and a linear fit per Lightness level. The facet factor fixes
# the panel order to R, G, B, Hue.
df %>%
  pivot_longer(cols = c(R, G, B, Hue)) %>%
  ggplot(mapping = aes(x = value, y = response)) +
  geom_point(mapping = aes(col = Lightness)) +
  geom_smooth(
    mapping = aes(fill = Lightness, col = Lightness),
    method = lm,
    formula = y ~ x
  ) +
  facet_wrap(
    ~ factor(name, levels = c("R", "G", "B", "Hue")),
    scales = "free"
  ) +
  theme_bw()

Yes, I think we can observe a clear upward trend for the continuous variables R, G, and B across the classifications. But for Hue, the trend is not as clear.

How can you visualize the behavior of the binary outcome with respect to the continuous inputs?

# Binary outcome against R, one free-scaled panel per Saturation level.
ggplot(data = df, mapping = aes(x = R, y = outcome)) +
  geom_point(mapping = aes(colour = outcome), size = 0.1) +
  facet_wrap(~Saturation, scales = "free")

# Binary outcome against G, one free-scaled panel per Saturation level.
ggplot(data = df, mapping = aes(x = G, y = outcome)) +
  geom_point(mapping = aes(colour = outcome), size = 0.1) +
  facet_wrap(~Saturation, scales = "free")

# Binary outcome against B, one free-scaled panel per Saturation level.
ggplot(data = df, mapping = aes(x = B, y = outcome)) +
  geom_point(mapping = aes(colour = outcome), size = 0.1) +
  facet_wrap(~Saturation, scales = "free")

# Share of each outcome value within every Saturation level; position "fill"
# stacks the bars and normalises each one to the same total height.
ggplot(data = df, mapping = aes(x = Saturation)) +
  geom_bar(mapping = aes(fill = as.factor(outcome)), position = "fill") +
  scale_fill_brewer(name = "outcome") +
  theme_bw()

# Share of each outcome value within every Lightness level; position "fill"
# stacks the bars and normalises each one to the same total height.
ggplot(data = df, mapping = aes(x = Lightness)) +
  geom_bar(mapping = aes(fill = as.factor(outcome)), position = "fill") +
  scale_fill_brewer(name = "outcome") +
  theme_bw()